# import libraries
import pandas as pd
import seaborn as sns
import missingno as msno
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import shap
from sklearn.model_selection import train_test_split
import plotly.express as px
# init shap
shap.initjs()
# change matplotlib style
plt.style.use('fivethirtyeight')
# set global font-size
matplotlib.rcParams.update({'font.size': 14})
# set font-scale for seaborn chart
sns.set(font_scale=1.5)
# change seaborn plot style
sns.set_style("whitegrid")
# read csv file
used_car_df = pd.read_csv('/Users/zhaoyudong/Downloads/vehicles.csv')
used_car_df.shape
(426880, 26)
used_car_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 426880 entries, 0 to 426879 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 426880 non-null int64 1 url 426880 non-null object 2 region 426880 non-null object 3 region_url 426880 non-null object 4 price 426880 non-null int64 5 year 425675 non-null float64 6 manufacturer 409234 non-null object 7 model 421603 non-null object 8 condition 252776 non-null object 9 cylinders 249202 non-null object 10 fuel 423867 non-null object 11 odometer 422480 non-null float64 12 title_status 418638 non-null object 13 transmission 424324 non-null object 14 VIN 265838 non-null object 15 drive 296313 non-null object 16 size 120519 non-null object 17 type 334022 non-null object 18 paint_color 296677 non-null object 19 image_url 426812 non-null object 20 description 426810 non-null object 21 county 0 non-null float64 22 state 426880 non-null object 23 lat 420331 non-null float64 24 long 420331 non-null float64 25 posting_date 426812 non-null object dtypes: float64(5), int64(2), object(19) memory usage: 84.7+ MB
# check null value
used_car_df.isnull().sum(axis = 0)
id 0 url 0 region 0 region_url 0 price 0 year 1205 manufacturer 17646 model 5277 condition 174104 cylinders 177678 fuel 3013 odometer 4400 title_status 8242 transmission 2556 VIN 161042 drive 130567 size 306361 type 92858 paint_color 130203 image_url 68 description 70 county 426880 state 0 lat 6549 long 6549 posting_date 68 dtype: int64
# head 5 row
used_car_df.head()
| id | url | region | region_url | price | year | manufacturer | model | condition | cylinders | ... | size | type | paint_color | image_url | description | county | state | lat | long | posting_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7222695916 | https://prescott.craigslist.org/cto/d/prescott... | prescott | https://prescott.craigslist.org | 6000 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | az | NaN | NaN | NaN |
| 1 | 7218891961 | https://fayar.craigslist.org/ctd/d/bentonville... | fayetteville | https://fayar.craigslist.org | 11900 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | ar | NaN | NaN | NaN |
| 2 | 7221797935 | https://keys.craigslist.org/cto/d/summerland-k... | florida keys | https://keys.craigslist.org | 21000 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | fl | NaN | NaN | NaN |
| 3 | 7222270760 | https://worcester.craigslist.org/cto/d/west-br... | worcester / central MA | https://worcester.craigslist.org | 1500 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | ma | NaN | NaN | NaN |
| 4 | 7210384030 | https://greensboro.craigslist.org/cto/d/trinit... | greensboro | https://greensboro.craigslist.org | 4900 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | nc | NaN | NaN | NaN |
5 rows × 26 columns
# backup dataset
used_car_df1 = used_car_df.copy()
# drop unralted columns
used_car_df = used_car_df.loc[:, ['price','year','manufacturer','condition','cylinders','fuel',
'odometer','transmission','drive','size','type','paint_color','state']]
# show matrix plot of different fields, check null values percentage
msno.matrix(used_car_df, figsize=(13, 5), color=(0,0,0.5))
<AxesSubplot:>
# drop null values
used_car_df = used_car_df.dropna(subset=['price','year','condition','manufacturer',
'cylinders','fuel','odometer','transmission',
'drive','paint_color','type','size'], how='any')
used_car_df.shape
(80170, 13)
used_car_df.head()
| price | year | manufacturer | condition | cylinders | fuel | odometer | transmission | drive | size | type | paint_color | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 31 | 15000 | 2013.0 | ford | excellent | 6 cylinders | gas | 128000.0 | automatic | rwd | full-size | truck | black | al |
| 55 | 19900 | 2004.0 | ford | good | 8 cylinders | diesel | 88000.0 | automatic | 4wd | full-size | pickup | blue | al |
| 59 | 14000 | 2012.0 | honda | excellent | 6 cylinders | gas | 95000.0 | automatic | fwd | full-size | mini-van | silver | al |
| 65 | 22500 | 2001.0 | ford | good | 8 cylinders | diesel | 144700.0 | manual | rwd | full-size | truck | white | al |
| 73 | 15000 | 2017.0 | dodge | excellent | 8 cylinders | gas | 90000.0 | automatic | rwd | mid-size | sedan | grey | al |
plt.boxplot(used_car_df.odometer)
plt.xlabel('odometer')
plt.show()
plt.boxplot(used_car_df.price)
plt.xlabel('price')
plt.show()
plt.hist(used_car_df['price'],bins=60)
plt.xlabel('price')
plt.title('Distribution of price')
plt.show()
plt.hist(used_car_df['odometer'],bins=60)
plt.xlabel('odometer')
plt.title('Distribution of odometer')
plt.show()
cols = ['price', 'odometer']
used_car_df[cols].describe()
| price | odometer | |
|---|---|---|
| count | 8.017000e+04 | 8.017000e+04 |
| mean | 7.953919e+04 | 1.248147e+05 |
| std | 1.379153e+07 | 2.494849e+05 |
| min | 0.000000e+00 | 0.000000e+00 |
| 25% | 4.950000e+03 | 7.300000e+04 |
| 50% | 9.000000e+03 | 1.140000e+05 |
| 75% | 1.750000e+04 | 1.554675e+05 |
| max | 3.736929e+09 | 1.000000e+07 |
# trim outlier
upper_limit = used_car_df[cols].quantile(0.99)
lower_limit = used_car_df[cols].quantile(0.01)
used_car_df_trimed = used_car_df[~(((used_car_df[cols] <= lower_limit)
|(used_car_df[cols] >= upper_limit)).any(axis=1))]
used_car_df_trimed.shape
(74476, 13)
### define a group of tools
def draw_multiple_price_boxplots(df, group_field, title='', xlabel='', filter_labels=[], orient="v"):
if len(filter_labels) != 0:
df = df[df[group_field].isin(filter_labels)]
size = (8, 8)
if len(df[group_field].unique()) >= 8:
size = (12, 8) if orient != 'h' else (8, 12)
fig,ax = plt.subplots(figsize=size)
x,y = group_field, "price"
if orient == 'h':
x,y = y,x
sns.boxplot(x=x, y=y, data=df, showfliers=False, ax=ax, orient=orient)
ax.set(
axisbelow=True, # Hide the grid behind plot objects
title=title
)
def _draw_bar_plot(df, field, title='', xlabel='', ylabel=''):
fig, ax = plt.subplots(figsize=(12,10))
ax.bar(df[field], df['price'], width=1, edgecolor="white", linewidth=0.7)
ax.set(
title=title,
xlabel=xlabel,
ylabel=ylabel
)
plt.show()
def draw_bar_plot(df, group_field, title='', xlabel='', ylabel=''):
grouped = df.groupby([group_field]).count()
grouped = grouped.reset_index()
_draw_bar_plot(grouped, group_field, title, xlabel, ylabel)
def draw_head_bar_plot(df, group_field, head=10, title='', xlabel='', ylabel=''):
grouped = df.groupby([group_field]).count()
grouped = grouped.reset_index()
grouped = grouped.head(head)
_draw_bar_plot(grouped, group_field, title, xlabel, ylabel)
def draw_pie_plot(df, group_field, title=''):
grouped = df[group_field].value_counts()/len(df) * 100
fig, ax = plt.subplots(figsize=(10,10))
ax.pie(grouped)
labels = [f'{l}, {s:0.1f}%' for l, s in zip(grouped.index, grouped)]
plt.legend(bbox_to_anchor=(1,0.5), loc="center right", labels=labels,
bbox_transform=plt.gcf().transFigure)
plt.savefig("output.png", bbox_inches="tight")
plt.subplots_adjust(left=0.0, bottom=0.2, right=0.9)
plt.title(title)
plt.show()
def draw_line_plot(x, y):
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(x, y, linewidth=2.0)
plt.show()
# price hist
fig,ax = plt.subplots(figsize=(16,10))
sns.histplot(data=used_car_df_trimed, x="price", kde=True, ax=ax)
ax.set(title='Frequency Distribution of Price')
[Text(0.5, 1.0, 'Frequency Distribution of Price')]
# year and confition
fig,ax = plt.subplots(figsize=(16,10))
sns.histplot(data=used_car_df_trimed,
x="year", hue="condition",kde=True, multiple="stack", ax=ax)
ax.set(title='Frequency Distribution of Year')
[Text(0.5, 1.0, 'Frequency Distribution of Year')]
grouped_by_year_df = used_car_df_trimed.groupby(['year'])
grouped_by_year_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| year | ||||||||
| 1900.0 | 1.0 | 38250.000000 | NaN | 38250.0 | 38250.0 | 38250.0 | 38250.0 | 38250.0 |
| 1905.0 | 1.0 | 3990.000000 | NaN | 3990.0 | 3990.0 | 3990.0 | 3990.0 | 3990.0 |
| 1916.0 | 1.0 | 12000.000000 | NaN | 12000.0 | 12000.0 | 12000.0 | 12000.0 | 12000.0 |
| 1918.0 | 1.0 | 16000.000000 | NaN | 16000.0 | 16000.0 | 16000.0 | 16000.0 | 16000.0 |
| 1923.0 | 5.0 | 20298.000000 | 4522.888458 | 15000.0 | 17500.0 | 18990.0 | 25000.0 | 25000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017.0 | 3399.0 | 22906.838188 | 11897.517403 | 1.0 | 14896.5 | 20980.0 | 31500.0 | 55900.0 |
| 2018.0 | 2620.0 | 26285.083969 | 13062.719164 | 1.0 | 17450.0 | 25256.0 | 36490.0 | 56000.0 |
| 2019.0 | 1622.0 | 28541.700986 | 13234.619328 | 1.0 | 19995.0 | 29900.0 | 36700.0 | 56499.0 |
| 2020.0 | 745.0 | 30739.477852 | 12365.196406 | 1.0 | 21900.0 | 30000.0 | 38500.0 | 56000.0 |
| 2021.0 | 83.0 | 33243.554217 | 14177.420484 | 50.0 | 24995.0 | 32900.0 | 44750.0 | 56000.0 |
101 rows × 8 columns
used_car_df_trimed.odometer.describe()
count 74476.000000 mean 117839.591949 std 58083.058896 min 203.000000 25% 76278.000000 50% 115831.000000 75% 156000.000000 max 295747.000000 Name: odometer, dtype: float64
plt.boxplot(used_car_df_trimed.odometer)
plt.xlabel('odometer')
plt.show()
# odometer hist
fig,ax = plt.subplots(figsize=(16,10))
sns.histplot(data=used_car_df_trimed, x="odometer", kde=True, ax=ax)
ax.set(title='Frequency Distribution of Odometer')
[Text(0.5, 1.0, 'Frequency Distribution of Odometer')]
sns.jointplot(x=used_car_df_trimed['odometer'], y=used_car_df_trimed['price'], height=10, kind="hex", color="#4CB391")
<seaborn.axisgrid.JointGrid at 0x7f8ce9f2c4f0>
used_car_df_trimed.condition.value_counts()
excellent 36758 good 24453 like new 9777 fair 2907 new 356 salvage 225 Name: condition, dtype: int64
used_car_df_trimed.condition.unique()
array(['excellent', 'good', 'new', 'fair', 'like new', 'salvage'],
dtype=object)
grouped_by_condition_df = used_car_df_trimed.groupby(['condition'])
grouped_by_condition_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| condition | ||||||||
| excellent | 36758.0 | 14202.414903 | 10603.938666 | 1.0 | 6600.0 | 10995.0 | 18900.0 | 56000.0 |
| fair | 2907.0 | 3798.018232 | 3651.584091 | 1.0 | 1550.0 | 2800.0 | 4700.0 | 55000.0 |
| good | 24453.0 | 10782.673578 | 9567.596169 | 1.0 | 4400.0 | 7490.0 | 13900.0 | 56000.0 |
| like new | 9777.0 | 17956.718728 | 12579.254665 | 1.0 | 7995.0 | 14900.0 | 25750.0 | 56499.0 |
| new | 356.0 | 18389.994382 | 15387.959027 | 1.0 | 5995.0 | 13997.0 | 29050.0 | 55000.0 |
| salvage | 225.0 | 4045.231111 | 4590.818723 | 1.0 | 1000.0 | 2000.0 | 5900.0 | 25000.0 |
draw_pie_plot(used_car_df_trimed, 'condition', title='Piechart of Condition')
draw_multiple_price_boxplots(used_car_df_trimed, 'condition',
'Boxplots of price of different fuel', 'fuel')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="condition", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7f8ceb9a7d60>
used_car_df_trimed.manufacturer.unique()
array(['ford', 'honda', 'dodge', 'chrysler', 'toyota', 'jeep', 'lexus',
'chevrolet', 'bmw', 'gmc', 'mercedes-benz', 'mazda', 'rover',
'ram', 'nissan', 'audi', 'mitsubishi', 'infiniti', 'volkswagen',
'kia', 'hyundai', 'fiat', 'acura', 'cadillac', 'lincoln', 'jaguar',
'saturn', 'volvo', 'alfa-romeo', 'buick', 'subaru', 'mini',
'pontiac', 'harley-davidson', 'porsche', 'tesla', 'mercury',
'datsun', 'land rover', 'aston-martin', 'ferrari'], dtype=object)
used_car_df_trimed.manufacturer.value_counts()
ford 14383 chevrolet 11170 toyota 6549 honda 4657 nissan 3809 jeep 3064 gmc 2899 ram 2658 dodge 2357 bmw 2101 subaru 1901 mercedes-benz 1813 volkswagen 1717 hyundai 1667 kia 1315 chrysler 1309 lexus 1198 cadillac 1151 buick 1000 mazda 940 audi 867 pontiac 700 acura 669 lincoln 630 infiniti 615 mitsubishi 596 volvo 579 mini 411 mercury 390 rover 355 saturn 341 porsche 221 jaguar 208 fiat 125 harley-davidson 31 alfa-romeo 28 tesla 25 datsun 14 land rover 8 aston-martin 3 ferrari 2 Name: manufacturer, dtype: int64
grouped_by_manu_df = used_car_df_trimed.groupby(['manufacturer'])
grouped_by_manu_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| manufacturer | ||||||||
| acura | 669.0 | 9385.112108 | 7204.492183 | 1.0 | 4700.00 | 7395.0 | 11999.00 | 46567.0 |
| alfa-romeo | 28.0 | 21795.964286 | 12485.364651 | 2000.0 | 9375.00 | 26800.0 | 28887.50 | 54998.0 |
| aston-martin | 3.0 | 42631.666667 | 9555.182276 | 32900.0 | 37947.50 | 42995.0 | 47497.50 | 52000.0 |
| audi | 867.0 | 13087.080738 | 8960.137121 | 1.0 | 6500.00 | 10999.0 | 16995.00 | 48000.0 |
| bmw | 2101.0 | 12371.571157 | 9771.248331 | 1.0 | 5900.00 | 9998.0 | 16350.00 | 55000.0 |
| buick | 1000.0 | 9640.080000 | 8413.417134 | 1.0 | 3950.00 | 6999.0 | 12995.00 | 50000.0 |
| cadillac | 1151.0 | 11637.825369 | 9539.912961 | 1.0 | 4999.00 | 9991.0 | 15000.00 | 53995.0 |
| chevrolet | 11170.0 | 14983.314772 | 11483.029943 | 1.0 | 5999.00 | 11750.0 | 21700.00 | 56000.0 |
| chrysler | 1309.0 | 7657.862490 | 5736.425782 | 1.0 | 3695.00 | 6250.0 | 9990.00 | 51500.0 |
| datsun | 14.0 | 13434.928571 | 7516.606082 | 2500.0 | 8248.00 | 15000.0 | 18800.00 | 26500.0 |
| dodge | 2357.0 | 10207.248197 | 9148.096780 | 1.0 | 4200.00 | 7500.0 | 12995.00 | 55900.0 |
| ferrari | 2.0 | 21500.500000 | 30404.884484 | 1.0 | 10750.75 | 21500.5 | 32250.25 | 43000.0 |
| fiat | 125.0 | 9099.544000 | 5357.066749 | 1.0 | 5500.00 | 7500.0 | 10950.00 | 32500.0 |
| ford | 14383.0 | 16129.381701 | 12363.794811 | 1.0 | 6500.00 | 12500.0 | 23495.00 | 56499.0 |
| gmc | 2899.0 | 18741.705761 | 13102.827331 | 1.0 | 8495.00 | 15000.0 | 27500.00 | 55680.0 |
| harley-davidson | 31.0 | 13801.612903 | 5332.392213 | 3000.0 | 11975.00 | 11975.0 | 17475.00 | 25950.0 |
| honda | 4657.0 | 8878.240713 | 6413.336155 | 1.0 | 4500.00 | 7000.0 | 11900.00 | 55000.0 |
| hyundai | 1667.0 | 8250.660468 | 5090.428855 | 1.0 | 4500.00 | 7400.0 | 10900.00 | 35995.0 |
| infiniti | 615.0 | 12112.582114 | 8990.761344 | 1.0 | 5800.00 | 9500.0 | 16997.00 | 52000.0 |
| jaguar | 208.0 | 11555.153846 | 9578.759443 | 1.0 | 5298.75 | 7900.0 | 14988.50 | 42995.0 |
| jeep | 3064.0 | 13772.554830 | 10399.795310 | 1.0 | 5500.00 | 11984.0 | 19900.00 | 54995.0 |
| kia | 1315.0 | 8650.286692 | 5685.056397 | 30.0 | 4900.00 | 7898.0 | 11000.00 | 55000.0 |
| land rover | 8.0 | 18116.875000 | 20783.590013 | 3950.0 | 6371.25 | 8247.5 | 19498.75 | 55000.0 |
| lexus | 1198.0 | 13183.651920 | 9573.382715 | 1.0 | 6861.25 | 10500.0 | 16998.00 | 51000.0 |
| lincoln | 630.0 | 9713.228571 | 7182.786736 | 1.0 | 4800.00 | 7797.5 | 13966.00 | 50985.0 |
| mazda | 940.0 | 8399.646809 | 5790.062169 | 1.0 | 4500.00 | 6925.0 | 9999.25 | 34500.0 |
| mercedes-benz | 1813.0 | 13116.517375 | 10631.268623 | 1.0 | 5850.00 | 9995.0 | 17995.00 | 55000.0 |
| mercury | 390.0 | 5601.289744 | 4978.931002 | 1.0 | 2756.25 | 3999.0 | 6495.00 | 35000.0 |
| mini | 411.0 | 9292.732360 | 5223.749597 | 1.0 | 5950.00 | 7995.0 | 10900.00 | 34595.0 |
| mitsubishi | 596.0 | 13272.395973 | 11462.055735 | 1.0 | 4150.00 | 7999.5 | 24990.00 | 39900.0 |
| nissan | 3809.0 | 9676.215280 | 6932.500816 | 1.0 | 4990.00 | 7995.0 | 12900.00 | 51000.0 |
| pontiac | 700.0 | 8142.174286 | 8275.697440 | 1.0 | 3500.00 | 4995.0 | 8525.00 | 46500.0 |
| porsche | 221.0 | 21236.963801 | 12373.857633 | 1.0 | 12000.00 | 18000.0 | 28000.00 | 56000.0 |
| ram | 2658.0 | 22670.181716 | 13884.901137 | 1.0 | 11000.00 | 20990.0 | 32995.00 | 56000.0 |
| rover | 355.0 | 18255.811268 | 13373.402910 | 1.0 | 8000.00 | 15500.0 | 23900.00 | 55812.0 |
| saturn | 341.0 | 4536.299120 | 2863.110247 | 300.0 | 2995.00 | 3995.0 | 5200.00 | 22000.0 |
| subaru | 1901.0 | 10301.986323 | 6855.840268 | 1.0 | 5499.00 | 8500.0 | 13990.00 | 43995.0 |
| tesla | 25.0 | 34804.400000 | 15983.478248 | 1.0 | 28500.00 | 36400.0 | 45000.00 | 54750.0 |
| toyota | 6549.0 | 12214.268133 | 9962.557170 | 1.0 | 5460.00 | 8900.0 | 15900.00 | 55000.0 |
| volkswagen | 1717.0 | 9278.158998 | 6120.498103 | 1.0 | 5000.00 | 7950.0 | 11995.00 | 40000.0 |
| volvo | 579.0 | 8188.341969 | 7972.708722 | 1.0 | 3550.00 | 5900.0 | 9850.00 | 52991.0 |
draw_head_bar_plot(used_car_df_trimed, 'manufacturer', title = 'Barchart of Manufacturer')
top_10_manufacturer = used_car_df_trimed.manufacturer.value_counts().head(10).index.tolist()
draw_multiple_price_boxplots(used_car_df_trimed, 'manufacturer',
'Boxplots of price of different manufacturer',
'manufacturer',filter_labels=top_10_manufacturer)
used_car_df_trimed.fuel.unique()
array(['gas', 'diesel', 'hybrid', 'electric', 'other'], dtype=object)
used_car_df_trimed.fuel.value_counts()
gas 68714 diesel 4633 hybrid 920 electric 117 other 92 Name: fuel, dtype: int64
grouped_by_fuel_df = used_car_df_trimed.groupby(['fuel'])
grouped_by_fuel_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| fuel | ||||||||
| diesel | 4633.0 | 27146.822577 | 14315.667955 | 1.0 | 15000.0 | 26990.0 | 38900.00 | 56499.0 |
| electric | 117.0 | 18387.470085 | 14846.362473 | 1100.0 | 7200.0 | 12999.0 | 23500.00 | 54750.0 |
| gas | 68714.0 | 12243.165323 | 9891.205969 | 1.0 | 5000.0 | 8997.0 | 16500.00 | 56000.0 |
| hybrid | 920.0 | 10334.448913 | 7614.810203 | 1.0 | 5700.0 | 7998.5 | 12499.00 | 49900.0 |
| other | 92.0 | 11681.076087 | 8501.509141 | 5.0 | 4937.5 | 9900.0 | 15923.75 | 45000.0 |
used_car_df_trimed.fuel.value_counts().index
Index(['gas', 'diesel', 'hybrid', 'electric', 'other'], dtype='object')
draw_pie_plot(used_car_df_trimed,'fuel', "Piechart of Fuel Types")
draw_multiple_price_boxplots(used_car_df_trimed, 'fuel',
'Boxplots of price of different fuel', 'fuel')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="fuel", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7f8ceb95daf0>
used_car_df_trimed.drive.unique()
array(['rwd', '4wd', 'fwd'], dtype=object)
used_car_df_trimed.drive.value_counts()
4wd 30985 fwd 27000 rwd 16491 Name: drive, dtype: int64
grouped_by_drive_df = used_car_df_trimed.groupby(['drive'])
grouped_by_drive_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| drive | ||||||||
| 4wd | 30985.0 | 16785.611522 | 12418.355071 | 1.0 | 7000.0 | 13900.0 | 23995.0 | 56499.0 |
| fwd | 27000.0 | 8184.754296 | 5911.396897 | 1.0 | 4200.0 | 6888.0 | 10495.0 | 56000.0 |
| rwd | 16491.0 | 14474.030562 | 10752.132758 | 1.0 | 6000.0 | 11500.0 | 20825.0 | 55995.0 |
draw_pie_plot(used_car_df_trimed, 'drive', "Piechart of Drive")
draw_bar_plot(used_car_df_trimed, 'drive', "Piechart of Drive")
draw_multiple_price_boxplots(used_car_df_trimed, 'drive',
'Boxplots of price of different drive', 'drive')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="drive", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7f8ce6d16490>
used_car_df_trimed.paint_color.unique()
array(['black', 'blue', 'silver', 'white', 'grey', 'yellow', 'red',
'green', 'brown', 'purple', 'custom', 'orange'], dtype=object)
used_car_df_trimed.paint_color.value_counts()
white 17963 black 13015 silver 10901 grey 8993 blue 7978 red 7507 green 2546 brown 2370 custom 1932 yellow 602 orange 423 purple 246 Name: paint_color, dtype: int64
grouped_by_color_df = used_car_df_trimed.groupby(['paint_color'])
grouped_by_color_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| paint_color | ||||||||
| black | 13015.0 | 14352.902497 | 11278.745947 | 1.0 | 6200.0 | 10995.0 | 19000.00 | 56000.0 |
| blue | 7978.0 | 11125.945976 | 9245.276472 | 1.0 | 4900.0 | 7995.0 | 14900.00 | 55000.0 |
| brown | 2370.0 | 10074.659494 | 8849.902685 | 1.0 | 4062.5 | 7490.0 | 12990.00 | 52995.0 |
| custom | 1932.0 | 12265.525880 | 10355.845933 | 1.0 | 4999.0 | 8894.0 | 15995.00 | 55000.0 |
| green | 2546.0 | 9446.562058 | 9063.892368 | 1.0 | 3500.0 | 6200.0 | 11950.00 | 55950.0 |
| grey | 8993.0 | 12450.698543 | 9978.807525 | 1.0 | 5500.0 | 9300.0 | 16000.00 | 56000.0 |
| orange | 423.0 | 14776.439716 | 10962.077589 | 1.0 | 5999.5 | 11995.0 | 21000.00 | 55000.0 |
| purple | 246.0 | 9423.300813 | 9492.141901 | 200.0 | 3815.0 | 6000.0 | 10999.75 | 55000.0 |
| red | 7507.0 | 12494.775410 | 10633.082820 | 1.0 | 4995.0 | 8995.0 | 16500.00 | 55500.0 |
| silver | 10901.0 | 10948.618751 | 9228.395040 | 1.0 | 4900.0 | 7950.0 | 13999.00 | 56000.0 |
| white | 17963.0 | 16202.734398 | 12136.043584 | 1.0 | 6850.0 | 12988.0 | 23900.00 | 56499.0 |
| yellow | 602.0 | 13047.308970 | 9970.903523 | 1.0 | 5500.0 | 9997.5 | 18000.00 | 54500.0 |
draw_pie_plot(used_car_df_trimed,'paint_color', "Piechart of Paint Color")
draw_bar_plot(used_car_df_trimed,'paint_color', "Bar Chart of Paint Color")
draw_multiple_price_boxplots(used_car_df_trimed, 'paint_color',
'Boxplots of price of different paint color', 'paint_color')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="paint_color", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8, aspect=1.5
)
<seaborn.axisgrid.FacetGrid at 0x7f8cecccabb0>
used_car_df_trimed.type.unique()
array(['truck', 'pickup', 'mini-van', 'sedan', 'offroad', 'SUV',
'convertible', 'coupe', 'hatchback', 'wagon', 'other', 'van',
'bus'], dtype=object)
used_car_df_trimed.type.value_counts()
sedan 20880 SUV 19061 truck 12182 pickup 6161 coupe 4082 hatchback 3041 van 2621 convertible 2134 mini-van 1794 wagon 1610 other 462 offroad 326 bus 122 Name: type, dtype: int64
grouped_by_type_df = used_car_df_trimed.groupby(['type'])
grouped_by_type_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| type | ||||||||
| SUV | 19061.0 | 12611.693353 | 9354.211886 | 1.0 | 5975.00 | 9998.0 | 16888.00 | 56000.0 |
| bus | 122.0 | 12264.950820 | 8989.174139 | 175.0 | 4700.00 | 11500.0 | 18495.00 | 32500.0 |
| convertible | 2134.0 | 13840.753046 | 9813.212157 | 1.0 | 6800.00 | 11000.0 | 18500.00 | 55000.0 |
| coupe | 4082.0 | 12576.124449 | 10831.705343 | 1.0 | 4800.00 | 8990.0 | 17500.00 | 55995.0 |
| hatchback | 3041.0 | 8256.286748 | 5636.887320 | 1.0 | 4500.00 | 6900.0 | 10000.00 | 50000.0 |
| mini-van | 1794.0 | 8893.641026 | 7733.397366 | 1.0 | 3896.25 | 6200.0 | 11500.00 | 54000.0 |
| offroad | 326.0 | 16297.392638 | 11216.922923 | 13.0 | 7500.00 | 13500.0 | 23388.00 | 54500.0 |
| other | 462.0 | 10856.374459 | 10011.339976 | 1.0 | 3825.00 | 7997.5 | 14500.00 | 52500.0 |
| pickup | 6161.0 | 17535.816751 | 12220.757982 | 1.0 | 7995.00 | 14994.0 | 24995.00 | 56000.0 |
| sedan | 20880.0 | 8347.595307 | 6445.277854 | 1.0 | 4100.00 | 6950.0 | 10500.00 | 55900.0 |
| truck | 12182.0 | 22077.240108 | 13383.732300 | 1.0 | 10950.00 | 20925.0 | 31900.00 | 56499.0 |
| van | 2621.0 | 15982.635254 | 10370.203447 | 1.0 | 7495.00 | 13995.0 | 22487.00 | 55000.0 |
| wagon | 1610.0 | 7738.832919 | 6616.183921 | 1.0 | 3400.00 | 6498.5 | 10387.25 | 54000.0 |
draw_pie_plot(used_car_df_trimed,'type', "Pie Chart of Types")
draw_bar_plot(used_car_df_trimed,'type', "Bar Chart of Types")
draw_multiple_price_boxplots(used_car_df_trimed, 'type',
'Boxplots of price of different types', 'type', orient="h")
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="type", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8, aspect=1.5
)
<seaborn.axisgrid.FacetGrid at 0x7f8ce662b340>
used_car_df_trimed.transmission.unique()
array(['automatic', 'manual', 'other'], dtype=object)
used_car_df_trimed.transmission.value_counts()
automatic 68046 manual 5561 other 869 Name: transmission, dtype: int64
grouped_by_transmission_df = used_car_df_trimed.groupby(['transmission'])
grouped_by_transmission_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| transmission | ||||||||
| automatic | 68046.0 | 13367.647915 | 10880.182871 | 1.0 | 5500.0 | 9900.0 | 17995.0 | 56499.0 |
| manual | 5561.0 | 11426.911707 | 9733.070516 | 1.0 | 4500.0 | 8000.0 | 15900.0 | 55500.0 |
| other | 869.0 | 7619.966628 | 10950.993805 | 1.0 | 1.0 | 100.0 | 12500.0 | 50000.0 |
draw_pie_plot(used_car_df_trimed,'transmission', "PieChart of Transmission Types")
draw_multiple_price_boxplots(used_car_df_trimed, 'transmission',
'Boxplots of price of different transmission', 'transmission')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="transmission", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7f8ceae3b640>
used_car_df_trimed['size'].unique()
array(['full-size', 'mid-size', 'compact', 'sub-compact'], dtype=object)
used_car_df_trimed['size'].value_counts()
full-size 41399 mid-size 22212 compact 9575 sub-compact 1290 Name: size, dtype: int64
grouped_by_size_df = used_car_df_trimed.groupby(['size'])
grouped_by_size_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| size | ||||||||
| compact | 9575.0 | 8684.537859 | 7133.975688 | 1.0 | 4000.00 | 6988.0 | 11000.0 | 55995.0 |
| full-size | 41399.0 | 15620.299838 | 12020.057764 | 1.0 | 6495.00 | 11999.0 | 22000.0 | 56499.0 |
| mid-size | 22212.0 | 10713.082433 | 8488.268715 | 1.0 | 4950.00 | 7999.0 | 13995.0 | 56000.0 |
| sub-compact | 1290.0 | 9305.075194 | 7306.001295 | 1.0 | 4599.25 | 6999.0 | 11999.0 | 55000.0 |
draw_pie_plot(used_car_df_trimed,'size', 'Piechart of Size')
draw_multiple_price_boxplots(used_car_df_trimed, 'size', 'Boxplots of price of different size')
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="size", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8
)
<seaborn.axisgrid.FacetGrid at 0x7f8cebd72d30>
used_car_df_trimed.cylinders.unique()
array(['6 cylinders', '8 cylinders', '4 cylinders', '5 cylinders',
'10 cylinders', '3 cylinders', 'other', '12 cylinders'],
dtype=object)
used_car_df_trimed.cylinders.value_counts()
6 cylinders 25474 4 cylinders 24914 8 cylinders 22231 10 cylinders 757 5 cylinders 697 other 224 3 cylinders 138 12 cylinders 41 Name: cylinders, dtype: int64
grouped_by_cylinders_df = used_car_df_trimed.groupby(['cylinders'])
grouped_by_cylinders_df.price.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| cylinders | ||||||||
| 10 cylinders | 757.0 | 15994.978864 | 15111.774061 | 1.0 | 1.00 | 11997.0 | 26990.00 | 55000.0 |
| 12 cylinders | 41.0 | 15947.634146 | 12236.549912 | 1.0 | 9300.00 | 11500.0 | 19900.00 | 52000.0 |
| 3 cylinders | 138.0 | 9828.985507 | 7740.449053 | 1.0 | 4923.75 | 8995.0 | 10998.50 | 34000.0 |
| 4 cylinders | 24914.0 | 9586.935016 | 6833.039030 | 1.0 | 4975.00 | 7976.0 | 12873.25 | 55995.0 |
| 5 cylinders | 697.0 | 7123.466284 | 5150.060638 | 1.0 | 3988.00 | 6000.0 | 8500.00 | 42000.0 |
| 6 cylinders | 25474.0 | 12638.406964 | 10697.934649 | 1.0 | 4988.00 | 8988.0 | 17495.00 | 56000.0 |
| 8 cylinders | 22231.0 | 17838.730107 | 12669.053547 | 1.0 | 7705.50 | 14975.0 | 25990.00 | 56499.0 |
| other | 224.0 | 14847.241071 | 12483.084141 | 1000.0 | 5950.00 | 10000.0 | 20918.50 | 54750.0 |
draw_pie_plot(used_car_df_trimed, 'cylinders', 'Piechart of Cylinders')
draw_multiple_price_boxplots(used_car_df_trimed, 'cylinders', 'Boxplots of price of different clinders', orient="h")
sns.catplot(
data=used_car_df_trimed, kind="bar",
x="cylinders", y="price", ax=ax,
ci="sd", palette="dark", alpha=.6, height=8, aspect=1.3
)
<seaborn.axisgrid.FacetGrid at 0x7f8cedd11b20>
# create grouped dataframe for plotly
groupby_state_df = used_car_df_trimed.groupby(['state'])
groupby_state_df_count = groupby_state_df.count()
groupby_state_df_count['count'] = groupby_state_df_count['price']
groupby_state_df_count = groupby_state_df_count.reset_index()
groupby_state_df_Mean = groupby_state_df.mean()
groupby_state_df_Mean = groupby_state_df_Mean.reset_index()
groupby_state_df_Mean = groupby_state_df_Mean.loc[:,['state', 'price']].merge(
groupby_state_df_count.loc[:,['state', 'count']], left_on='state', right_on='state')
groupby_state_df_Mean['state'] = groupby_state_df_Mean['state'].str.upper()
# draw U.S. heat map
fig = px.choropleth(groupby_state_df_Mean,
locations="state",
color="count",
hover_name="state", # DataFrame column hover info
locationmode = 'USA-states') # Set to plot as US States
fig.update_layout(
title_text = 'Frequency Distribution',
geo_scope='usa',
)
fig.show()
# draw U.S. heat map
fig = px.choropleth(groupby_state_df_Mean,
locations="state",
color="price",
hover_name="state", # DataFrame column hover info
locationmode = 'USA-states') # Set to plot as US States
fig.update_layout(
title_text = 'Average price of different state',
geo_scope='usa',
)
fig.show()
# create dummies value
def categorical_2_ordered_dummies(df, field, orderList):
df[field] = df[field].astype('category')
df[field] = df[field].cat.reorder_categories(orderList, ordered=True)
df[field] = df[field].cat.codes
## ordered
# conditions
categorical_2_ordered_dummies(used_car_df_trimed, 'condition',
['salvage','fair','good','excellent', 'like new', 'new'])
# size
categorical_2_ordered_dummies(used_car_df_trimed, 'size',
['sub-compact', 'compact', 'mid-size', 'full-size'])
def categorical_one_hot_encoding(df, fields):
prefixs = []
for k in fields:
prefixs.append(k + '_')
return pd.get_dummies(df, columns=fields, prefix=prefixs)
# non-ordered
used_car_df_trimed = categorical_one_hot_encoding(used_car_df_trimed,
['manufacturer','cylinders','fuel','transmission','drive','type','paint_color','state'])
# adjust year
used_car_df_trimed['year'] = 2022 - used_car_df_trimed['year']
# create train data and test data
y = used_car_df_trimed.iloc[:,0]
X = used_car_df_trimed.iloc[:,1:]
X = sm.add_constant(X)
# a simple linear model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=10)
# train statsmodel
model = sm.OLS(y_train, X_train).fit()
# print summary
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.607
Model: OLS Adj. R-squared: 0.606
Method: Least Squares F-statistic: 860.9
Date: Sun, 12 Dec 2021 Prob (F-statistic): 0.00
Time: 01:23:59 Log-Likelihood: -7.5512e+05
No. Observations: 73731 AIC: 1.511e+06
Df Residuals: 73598 BIC: 1.512e+06
Df Model: 132
Covariance Type: nonrobust
=================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------
const 9205.9471 162.084 56.797 0.000 8888.263 9523.632
year -239.9513 3.099 -77.426 0.000 -246.026 -233.877
condition 1502.8824 36.222 41.491 0.000 1431.888 1573.877
odometer -0.0830 0.000 -170.769 0.000 -0.084 -0.082
size 850.1660 41.115 20.678 0.000 769.581 930.751
manufacturer__acura 489.4520 317.303 1.543 0.123 -132.460 1111.364
manufacturer__alfa-romeo 3642.4523 1268.236 2.872 0.004 1156.714 6128.190
manufacturer__aston-martin 2.062e+04 3896.439 5.291 0.000 1.3e+04 2.83e+04
manufacturer__audi -818.4172 293.700 -2.787 0.005 -1394.069 -242.766
manufacturer__bmw -1533.7300 236.459 -6.486 0.000 -1997.189 -1070.271
manufacturer__buick -1245.0643 282.321 -4.410 0.000 -1798.413 -691.716
manufacturer__cadillac -1829.7672 271.524 -6.739 0.000 -2361.954 -1297.580
manufacturer__chevrolet -799.4309 196.046 -4.078 0.000 -1183.680 -415.182
manufacturer__chrysler -3294.1199 263.461 -12.503 0.000 -3810.503 -2777.737
manufacturer__datsun 7469.5536 1785.344 4.184 0.000 3970.287 1.1e+04
manufacturer__dodge -2851.7473 230.993 -12.346 0.000 -3304.494 -2399.001
manufacturer__ferrari 698.2963 4719.884 0.148 0.882 -8552.659 9949.252
manufacturer__fiat -5266.7346 632.388 -8.328 0.000 -6506.213 -4027.256
manufacturer__ford -1028.0803 194.215 -5.294 0.000 -1408.741 -647.419
manufacturer__gmc 309.9876 226.097 1.371 0.170 -133.162 753.137
manufacturer__harley-davidson -5573.3800 1208.848 -4.610 0.000 -7942.717 -3204.043
manufacturer__honda 414.1870 210.772 1.965 0.049 1.075 827.299
manufacturer__hyundai -1977.9036 247.703 -7.985 0.000 -2463.401 -1492.406
manufacturer__infiniti -1127.7709 327.814 -3.440 0.001 -1770.285 -485.257
manufacturer__jaguar -2454.3720 498.575 -4.923 0.000 -3431.578 -1477.166
manufacturer__jeep -558.4166 224.821 -2.484 0.013 -999.064 -117.769
manufacturer__kia -2261.7342 262.116 -8.629 0.000 -2775.481 -1747.987
manufacturer__land rover 3516.8861 2352.058 1.495 0.135 -1093.138 8126.910
manufacturer__lexus 1631.4187 267.911 6.089 0.000 1106.313 2156.524
manufacturer__lincoln -3305.2072 324.833 -10.175 0.000 -3941.879 -2668.536
manufacturer__mazda -1120.6321 286.446 -3.912 0.000 -1682.064 -559.200
manufacturer__mercedes-benz -1144.2408 242.191 -4.725 0.000 -1618.934 -669.548
manufacturer__mercury -3427.1097 386.127 -8.876 0.000 -4183.918 -2670.302
manufacturer__mini -2254.0713 383.237 -5.882 0.000 -3005.213 -1502.929
manufacturer__mitsubishi -2090.2062 334.122 -6.256 0.000 -2745.085 -1435.328
manufacturer__nissan -2744.4790 213.337 -12.865 0.000 -3162.619 -2326.339
manufacturer__pontiac -71.4204 314.549 -0.227 0.820 -687.934 545.094
manufacturer__porsche 5516.0865 488.195 11.299 0.000 4559.227 6472.946
manufacturer__ram 24.0140 233.348 0.103 0.918 -433.347 481.375
manufacturer__rover 1250.5503 401.178 3.117 0.002 464.243 2036.857
manufacturer__saturn -3189.6203 406.796 -7.841 0.000 -3986.939 -2392.302
manufacturer__subaru -1274.7857 250.798 -5.083 0.000 -1766.349 -783.223
manufacturer__tesla 1.666e+04 1474.001 11.305 0.000 1.38e+04 1.96e+04
manufacturer__toyota 1735.4495 203.308 8.536 0.000 1336.967 2133.932
manufacturer__volkswagen -1497.5133 248.747 -6.020 0.000 -1985.057 -1009.969
manufacturer__volvo -33.0345 357.933 -0.092 0.926 -734.582 668.513
cylinders__10 cylinders 1740.9182 281.560 6.183 0.000 1189.061 2292.775
cylinders__12 cylinders 4662.3925 980.073 4.757 0.000 2741.452 6583.333
cylinders__3 cylinders -2221.7926 538.565 -4.125 0.000 -3277.378 -1166.207
cylinders__4 cylinders -373.3274 180.077 -2.073 0.038 -726.278 -20.377
cylinders__5 cylinders -589.0464 309.329 -1.904 0.057 -1195.330 17.237
cylinders__6 cylinders 1284.6111 177.525 7.236 0.000 936.662 1632.560
cylinders__8 cylinders 3645.5774 181.694 20.064 0.000 3289.458 4001.697
cylinders__other 1056.6144 496.574 2.128 0.033 83.331 2029.898
fuel__diesel 9646.7616 224.765 42.919 0.000 9206.223 1.01e+04
fuel__electric 1509.8867 650.469 2.321 0.020 234.971 2784.803
fuel__gas -541.2850 206.372 -2.623 0.009 -945.774 -136.796
fuel__hybrid -255.4935 276.168 -0.925 0.355 -796.781 285.794
fuel__other -1153.9226 608.008 -1.898 0.058 -2345.616 37.770
transmission__automatic 5918.1730 96.853 61.105 0.000 5728.341 6108.005
transmission__manual 6939.5100 114.629 60.539 0.000 6714.838 7164.182
transmission__other -3651.7359 180.369 -20.246 0.000 -4005.259 -3298.213
drive__4wd 4900.6225 71.531 68.511 0.000 4760.422 5040.823
drive__fwd 1185.8007 76.229 15.556 0.000 1036.393 1335.209
drive__rwd 3119.5239 74.429 41.913 0.000 2973.643 3265.405
type__SUV -384.9482 89.354 -4.308 0.000 -560.082 -209.814
type__bus -1225.7479 581.733 -2.107 0.035 -2365.942 -85.554
type__convertible 2447.7848 162.320 15.080 0.000 2129.637 2765.932
type__coupe 2113.0129 125.592 16.824 0.000 1866.853 2359.172
type__hatchback -1392.7030 144.310 -9.651 0.000 -1675.551 -1109.855
type__mini-van 285.1080 176.874 1.612 0.107 -61.564 631.780
type__offroad 2493.9785 365.974 6.815 0.000 1776.670 3211.286
type__other -589.1339 302.269 -1.949 0.051 -1181.581 3.313
type__pickup 2436.9951 117.238 20.787 0.000 2207.208 2666.782
type__sedan -1798.9650 88.201 -20.396 0.000 -1971.838 -1626.092
type__truck 4659.3627 103.987 44.807 0.000 4455.549 4863.177
type__van 2216.4083 148.045 14.971 0.000 1926.241 2506.575
type__wagon -2055.2052 180.916 -11.360 0.000 -2409.800 -1700.610
paint_color__black 1266.0532 79.960 15.834 0.000 1109.332 1422.774
paint_color__blue 238.6170 90.366 2.641 0.008 61.500 415.734
paint_color__brown -433.0147 141.484 -3.061 0.002 -710.322 -155.707
paint_color__custom 880.7736 153.773 5.728 0.000 579.379 1182.168
paint_color__green -31.2454 137.516 -0.227 0.820 -300.776 238.286
paint_color__grey 539.5172 88.179 6.118 0.000 366.686 712.348
paint_color__orange 2860.5261 310.894 9.201 0.000 2251.175 3469.877
paint_color__purple 837.0642 402.256 2.081 0.037 48.643 1625.485
paint_color__red 210.5496 92.286 2.281 0.023 29.669 391.430
paint_color__silver 124.9586 83.470 1.497 0.134 -38.642 288.559
paint_color__white 1138.1976 76.061 14.964 0.000 989.118 1287.277
paint_color__yellow 1573.9502 262.474 5.997 0.000 1059.503 2088.398
state__ak 3537.8431 274.946 12.867 0.000 2998.950 4076.737
state__al 2263.1827 240.859 9.396 0.000 1791.100 2735.265
state__ar 701.4458 298.903 2.347 0.019 115.596 1287.295
state__az 123.0685 173.241 0.710 0.477 -216.484 462.621
state__ca 1000.8654 85.700 11.679 0.000 832.893 1168.837
state__co 50.0138 165.348 0.302 0.762 -274.068 374.096
state__ct -1362.9261 229.156 -5.948 0.000 -1812.070 -913.782
state__dc -1547.6719 307.149 -5.039 0.000 -2149.683 -945.661
state__de 527.3768 447.063 1.180 0.238 -348.866 1403.619
state__fl -1652.9823 98.020 -16.864 0.000 -1845.101 -1460.864
state__ga 940.5338 233.873 4.022 0.000 482.143 1398.924
state__hi 825.4556 326.839 2.526 0.012 184.853 1466.058
state__ia -155.2052 170.199 -0.912 0.362 -488.794 178.384
state__id 242.6718 185.845 1.306 0.192 -121.583 606.927
state__il -457.8805 172.399 -2.656 0.008 -795.782 -119.979
state__in -679.3883 194.390 -3.495 0.000 -1060.392 -298.385
state__ks 451.1065 190.140 2.372 0.018 78.433 823.780
state__ky 656.8721 213.429 3.078 0.002 238.553 1075.192
state__la -1108.0410 359.408 -3.083 0.002 -1812.478 -403.604
state__ma -1221.9875 170.137 -7.182 0.000 -1555.456 -888.519
state__md -632.5970 276.777 -2.286 0.022 -1175.078 -90.116
state__me 431.7520 283.408 1.523 0.128 -123.727 987.231
state__mi 1119.5608 133.734 8.372 0.000 857.443 1381.679
state__mn 137.0114 157.570 0.870 0.385 -171.825 445.848
state__mo 809.8195 257.045 3.150 0.002 306.012 1313.627
state__ms 529.7345 473.196 1.119 0.263 -397.728 1457.197
state__mt 1355.5604 229.414 5.909 0.000 905.909 1805.212
state__nc 1357.2805 136.219 9.964 0.000 1090.292 1624.269
state__nd 1925.2895 462.123 4.166 0.000 1019.531 2831.048
state__ne -207.1048 412.154 -0.502 0.615 -1014.924 600.715
state__nh -1293.0699 289.117 -4.472 0.000 -1859.739 -726.401
state__nj -1380.9116 158.536 -8.710 0.000 -1691.641 -1070.182
state__nm 45.7290 247.931 0.184 0.854 -440.215 531.673
state__nv -931.9171 305.720 -3.048 0.002 -1531.127 -332.707
state__ny -1415.0552 110.944 -12.755 0.000 -1632.506 -1197.604
state__oh -1672.3734 118.323 -14.134 0.000 -1904.287 -1440.460
state__ok 748.2047 201.246 3.718 0.000 353.764 1142.646
state__or 609.8799 151.490 4.026 0.000 312.959 906.801
state__pa -1673.6724 134.841 -12.412 0.000 -1937.959 -1409.385
state__ri -1402.8743 276.550 -5.073 0.000 -1944.912 -860.837
state__sc 994.9875 204.553 4.864 0.000 594.064 1395.912
state__sd 665.2882 386.432 1.722 0.085 -92.118 1422.694
state__tn 2362.1600 166.348 14.200 0.000 2036.118 2688.202
state__tx -396.0159 119.783 -3.306 0.001 -630.789 -161.242
state__ut 2646.7747 464.082 5.703 0.000 1737.176 3556.373
state__va -89.6070 154.361 -0.581 0.562 -392.154 212.940
state__vt 516.2568 221.729 2.328 0.020 81.668 950.845
state__wa 114.4259 216.419 0.529 0.597 -309.755 538.607
state__wi 41.7150 141.816 0.294 0.769 -236.244 319.674
state__wv -599.9337 575.004 -1.043 0.297 -1726.939 527.072
state__wy 1355.2962 574.887 2.357 0.018 228.520 2482.073
==============================================================================
Omnibus: 10684.901 Durbin-Watson: 1.999
Prob(Omnibus): 0.000 Jarque-Bera (JB): 43522.323
Skew: 0.677 Prob(JB): 0.00
Kurtosis: 6.512 Cond. No. 3.52e+17
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.03e-20. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
y_pred = model.predict(X_test)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)
45024232.60457944
# compute the SHAP values for the linear model
explainer = shap.Explainer(model.predict, X_train)
# explainer process will take several minutes, please wait!
shap_values = explainer(X_test)
Permutation explainer: 746it [00:20, 21.03it/s]
shap.plots.bar(shap_values, max_display = 15)
shap.plots.heatmap(shap_values, max_display = 15)
shap.plots.beeswarm(shap_values, max_display = 15)
# explain sample 1
shap.plots.force(shap_values[0])
shap.plots.waterfall(shap_values[0])
used_car_df[used_car_df.index==y_test.index[0]]
| price | year | manufacturer | condition | cylinders | fuel | odometer | transmission | drive | size | type | paint_color | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35697 | 6350 | 2005.0 | audi | excellent | 4 cylinders | gas | 105333.0 | automatic | fwd | compact | convertible | silver | ca |
# explain sample 2
shap.plots.force(shap_values[110])
used_car_df[used_car_df.index==y_test.index[110]]
| price | year | manufacturer | condition | cylinders | fuel | odometer | transmission | drive | size | type | paint_color | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 291226 | 6995 | 2011.0 | chevrolet | good | 8 cylinders | gas | 229183.0 | automatic | rwd | mid-size | van | white | oh |
shap.plots.waterfall(shap_values[110])